Exploratory Analysis: Graphics

Histograms

## boxplot for asthma rates for states with high and low asthma prev
ggplot(df, aes(x = adj_prevalence)) + geom_histogram() + geom_density() + theme_classic() +
  labs( x = "Age-adjusted Asthma Prevalence (>= 18 Years Old)",
        y = "Count", 
        title = "Distribution of Asthma Prevalence for Nearly Every County in the US")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## boxplot for ozone rates for state with high and low asthma prev
ggplot(df, aes(x = yearly_avg_ozone)) + geom_histogram() + geom_density() + theme_classic() +
  labs( x = "Yearly Average Ozone Pollution",
        y = "Count", 
        title = "Distribution of Ozone Pollution for Nearly Every County in the US")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## boxplot for pm25 rates for state with high and low asthma prev
ggplot(df, aes(x = yearly_avg_pm25)) + geom_histogram() + geom_density() + theme_classic() +
  labs( x = "Yearly Average PM 2.5 Pollution",
        y = "Count", 
        title = "Distribution of PM 2.5 Pollution for Nearly Every County in the US")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## boxplot for obesity rates for state with high and low asthma prev
ggplot(df, aes(x = adj_obesity)) + geom_histogram() + geom_density() + theme_classic() +
  labs( x = "Age-adjusted Obesity Prevalence (>= 18 Years Old)",
        y = "Count", 
        title = "Distribution of Obesity Prevalence for Nearly Every County in the US")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## boxplot for smoking rates for state with high and low asthma prev
ggplot(df, aes(x = adj_smoking)) + geom_histogram() + geom_density() + theme_classic() +
  labs( x = "Age-adjusted Smoking Prevalence (>= 18 Years Old)",
        y = "Count", 
        title = "Distribution of Smoking Prevalence for Nearly Every County in the US")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Boxplots

## boxplot for asthma rates for states with high and low asthma prev
ggplot(df, aes(x = Region_Name, y = adj_prevalence, fill = Region_Name)) + geom_boxplot() + scale_fill_brewer(palette = "Set1") + theme_classic() +
  theme(legend.position = "none") +
  labs( x = "Regions Across the United States",
        y = "Age-adjusted Asthma Prevalence (>= 18 Years Old)", 
        title = "Distribution of Asthma Prevalence Across United States Regions") 

## boxplot for ozone rates for state with high and low asthma prev
ggplot(df, aes(x = Region_Name, y = yearly_avg_ozone, fill = Region_Name)) + geom_boxplot() + scale_fill_brewer(palette = "Set1") + theme_classic() +
  theme(legend.position = "none") +
  labs( x = "Regions Across the United States",
        y = "2016 Year Average Ozone Pollution", 
        title = "Distribution of Ozone Pollution Prevalence Across United States Regions") 

## boxplot for pm25 rates for state with high and low asthma prev
ggplot(df, aes(x = Region_Name, y = yearly_avg_pm25, fill = Region_Name)) + geom_boxplot() + scale_fill_brewer(palette = "Set1") + theme_classic() +
  theme(legend.position = "none") +
  labs( x = "Regions Across the United States",
        y = "2016 Year Average PM 2.5 Pollution", 
        title = "Distribution of PM 2.5 Pollution Prevalence Across United States Regions") 

## boxplot for obesity rates for state with high and low asthma prev
ggplot(df, aes(x = Region_Name, y = adj_obesity, fill = Region_Name)) + geom_boxplot() + scale_fill_brewer(palette = "Set1") + theme_classic() +
  theme(legend.position = "none") +
  labs( x = "Regions Across the United States",
        y = "Age-adjusted Obesity Prevalence (>= 18 Years Old)", 
        title = "Distribution of Ozone Pollution Prevalence Across United States Regions") 

## boxplot for smoking rates for state with high and low asthma prev
ggplot(df, aes(x = Region_Name, y = adj_smoking, fill = Region_Name)) + geom_boxplot() + scale_fill_brewer(palette = "Set1") + theme_classic() +
  theme(legend.position = "none") +
  labs( x = "Regions Across the United States",
        y = "Age-adjusted Smoking Prevalence (>= 18 Years Old)",
        title = "Distribution of Ozone Pollution Prevalence Across United States Regions") 

Scatter Plots

## Asthma Prevalence and Ozone
ggplot(df, aes(x = yearly_avg_ozone, y = adj_prevalence, color = Region_Name)) + geom_point() + scale_colour_brewer(palette = "Set1") + theme_classic() +  #geom_density_2d_filled()
  theme(legend.title = element_blank(), legend.position = "bottom") +
  labs( x = "2016 Year Average Ozone Pollution",
        y = "Age-adjusted Asthma Prevalence (>= 18 Years Old)", 
        title = "Scatter Plot of Ozone Pollution And Asthma Prevalence") 

## Asthma Prevalence and PM25
ggplot(df, aes(x = yearly_avg_pm25, y = adj_prevalence, color = Region_Name)) + geom_point() + scale_colour_brewer(palette = "Set1") + theme_classic() +  #geom_density_2d_filled()
  theme(legend.title = element_blank(), legend.position = "bottom") +
  labs( x = "2016 Year Average PM 2.5 Pollution",
        y = "Age-adjusted Asthma Prevalence (>= 18 Years Old)", 
        title = "Scatter Plot of PM 2.5 Pollution And Asthma Prevalence") 

## Asthma Prevalence and Smoking 
ggplot(df, aes(x = adj_smoking, y = adj_prevalence, color = Region_Name)) + geom_point() + scale_colour_brewer(palette = "Set1") + theme_classic() +  #geom_density_2d_filled()
  theme(legend.title = element_blank(), legend.position = "bottom") +
  labs( x = "Age-adusted Smoking Prevalence (>= 18 Years Old)",
        y = "Age-adjusted Asthma Prevalence (>= 18 Years Old)", 
        title = "Scatter Plot of Smoking Prevalence And Asthma Prevalence")

## Asthma Prevalence and Obesity
ggplot(df, aes(x = adj_obesity, y = adj_prevalence, color = Region_Name)) + geom_point() + scale_colour_brewer(palette = "Set1") + theme_classic() +  #geom_density_2d_filled()
  theme(legend.title = element_blank(), legend.position = "bottom") +
  labs( x = "Age-adusted Obesity Prevalence (>= 18 Years Old)",
        y = "Age-adjusted Asthma Prevalence (>= 18 Years Old)", 
        title = "Scatter Plot of Obesity Prevalence And Asthma Prevalence")

Linear Models

The model that is going to be use for the project includes Mod2b.

Univariate Linear Models

By comparing the AIC values (generally want a lower value), comparison of the values indicates the smoking model is the singular most influential factor for predicting asthma prevalence. This is further supported by enormous F value for that model. Given the availability of additional risk factors, multivariate models will be included. The residual plots for this model also line up appropriately/ideally.

Ozone

ozone.lm = lm(adj_prevalence ~ yearly_avg_ozone, data = df)
summary.aov(ozone.lm)
##                    Df Sum Sq Mean Sq F value   Pr(>F)    
## yearly_avg_ozone    1   30.1  30.096   30.36 3.92e-08 ***
## Residuals        2818 2793.7   0.991                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(ozone.lm)
## [1] 7982.367
plot(ozone.lm)

PM 2.5

pm25.lm = lm(adj_prevalence ~ yearly_avg_pm25, data = df)
summary.aov(pm25.lm)
##                   Df Sum Sq Mean Sq F value Pr(>F)    
## yearly_avg_pm25    1    123  123.04   128.4 <2e-16 ***
## Residuals       2818   2701    0.96                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(pm25.lm)
## [1] 7886.956
plot(pm25.lm)

Obesity

obesity.lm = lm(adj_prevalence ~ adj_obesity, data = df)
summary.aov(obesity.lm)
##               Df Sum Sq Mean Sq F value Pr(>F)    
## adj_obesity    1  468.1   468.1     560 <2e-16 ***
## Residuals   2818 2355.6     0.8                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(obesity.lm)
## [1] 7501.43
plot(obesity.lm)

Smoking

smoking.lm = lm(adj_prevalence ~ adj_smoking, data = df)
summary.aov(smoking.lm)
##               Df Sum Sq Mean Sq F value Pr(>F)    
## adj_smoking    1   1455  1454.6    2994 <2e-16 ***
## Residuals   2818   1369     0.5                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(smoking.lm)
## [1] 5971.25
plot(smoking.lm)

Multivariate Linear Models

The initial plan was preform backwards and then forward selection, however the first backward selection mode, all variables came back significant. This could be a function of having multiple data points, however regardless, then the interactions between the individual variables were explored.

The diagnostic plots for the multivariate models fit more appropriately than the any of the uni-variate models. The least complicated model with an idea AIC score is the the model that include the interactions between regions and obesity. This decision took consideration the diagnostics plots and the AIC values.

All Main Effect Variables (Regions)

mod1 = lm(adj_prevalence ~ yearly_avg_ozone + yearly_avg_pm25 + adj_obesity + adj_smoking + Region_Name, data = df)
#summary(mod1)
summary.aov(mod1)
##                    Df Sum Sq Mean Sq F value Pr(>F)    
## yearly_avg_ozone    1   30.1    30.1   110.8 <2e-16 ***
## yearly_avg_pm25     1  103.1   103.1   379.7 <2e-16 ***
## adj_obesity         1  405.1   405.1  1492.2 <2e-16 ***
## adj_smoking         1  927.4   927.4  3415.6 <2e-16 ***
## Region_Name         7  595.7    85.1   313.4 <2e-16 ***
## Residuals        2808  762.4     0.3                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(mod1)
## [1] 4340.202
plot(mod1)

Interactions … between Region and Smoking
mod2a = lm(adj_prevalence ~ yearly_avg_ozone + yearly_avg_pm25 + adj_obesity+ Region_Name * adj_smoking, data = df)
#summary(mod2a)
summary.aov(mod2a)
##                           Df Sum Sq Mean Sq F value Pr(>F)    
## yearly_avg_ozone           1   30.1    30.1  122.28 <2e-16 ***
## yearly_avg_pm25            1  103.1   103.1  418.87 <2e-16 ***
## adj_obesity                1  405.1   405.1 1646.02 <2e-16 ***
## Region_Name                7  798.8   114.1  463.65 <2e-16 ***
## adj_smoking                1  724.2   724.2 2942.25 <2e-16 ***
## Region_Name:adj_smoking    7   73.0    10.4   42.36 <2e-16 ***
## Residuals               2801  689.4     0.2                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(mod2a)
## [1] 4070.463
plot(mod2a)

Interactions …between Region and obesity
mod2b = lm(adj_prevalence ~ yearly_avg_ozone + yearly_avg_pm25 + adj_smoking + Region_Name * adj_obesity, data = df)
#summary(mod2b)
summary.aov(mod2b)
##                           Df Sum Sq Mean Sq F value Pr(>F)    
## yearly_avg_ozone           1   30.1    30.1  117.20 <2e-16 ***
## yearly_avg_pm25            1  103.1   103.1  401.48 <2e-16 ***
## adj_smoking                1 1330.5  1330.5 5181.07 <2e-16 ***
## Region_Name                7  554.0    79.1  308.21 <2e-16 ***
## adj_obesity                1   43.6    43.6  169.94 <2e-16 ***
## Region_Name:adj_obesity    7   43.1     6.2   23.98 <2e-16 ***
## Residuals               2801  719.3     0.3                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(mod2b)
## [1] 4190.079
plot(mod2b)

Interactions … between Region and smoking AND Region and obesity
mod2c = lm(adj_prevalence ~ yearly_avg_ozone + yearly_avg_pm25 + adj_smoking + Region_Name * adj_smoking + Region_Name * adj_obesity, data = df)
#summary(mod2c)
summary.aov(mod2c)
##                           Df Sum Sq Mean Sq F value Pr(>F)    
## yearly_avg_ozone           1   30.1    30.1  136.27 <2e-16 ***
## yearly_avg_pm25            1  103.1   103.1  466.81 <2e-16 ***
## adj_smoking                1 1330.5  1330.5 6024.12 <2e-16 ***
## Region_Name                7  554.0    79.1  358.36 <2e-16 ***
## adj_obesity                1   43.6    43.6  197.59 <2e-16 ***
## adj_smoking:Region_Name    7   73.0    10.4   47.20 <2e-16 ***
## Region_Name:adj_obesity    7   72.3    10.3   46.79 <2e-16 ***
## Residuals               2794  617.1     0.2                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(mod2c)
## [1] 3771.881
plot(mod2c)

Interactions … between Region and log(smoking) AND Region and obesity
mod2d = lm(adj_prevalence ~ yearly_avg_ozone + yearly_avg_pm25 + adj_smoking + Region_Name * log(adj_smoking) + Region_Name * adj_obesity, data = df)
#summary(mod2d)
summary.aov(mod2d)
##                                Df Sum Sq Mean Sq F value Pr(>F)    
## yearly_avg_ozone                1   30.1    30.1  137.53 <2e-16 ***
## yearly_avg_pm25                 1  103.1   103.1  471.12 <2e-16 ***
## adj_smoking                     1 1330.5  1330.5 6079.78 <2e-16 ***
## Region_Name                     7  554.0    79.1  361.67 <2e-16 ***
## log(adj_smoking)                1   25.9    25.9  118.13 <2e-16 ***
## adj_obesity                     1   47.5    47.5  217.07 <2e-16 ***
## Region_Name:log(adj_smoking)    7   51.7     7.4   33.75 <2e-16 ***
## Region_Name:adj_obesity         7   69.8    10.0   45.54 <2e-16 ***
## Residuals                    2793  611.2     0.2                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(mod2d)
## [1] 3746.935
plot(mod2d)

All Main Effect Variables (States)

mod3 = lm(adj_prevalence ~ yearly_avg_ozone + yearly_avg_pm25 + adj_obesity + adj_smoking + state_desc, data = df)
#summary(mod3)
summary.aov(mod3)
##                    Df Sum Sq Mean Sq F value Pr(>F)    
## yearly_avg_ozone    1   30.1    30.1   264.8 <2e-16 ***
## yearly_avg_pm25     1  103.1   103.1   907.1 <2e-16 ***
## adj_obesity         1  405.1   405.1  3564.6 <2e-16 ***
## adj_smoking         1  927.4   927.4  8159.2 <2e-16 ***
## state_desc         42 1042.9    24.8   218.5 <2e-16 ***
## Residuals        2773  315.2     0.1                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(mod3)
## [1] 1919.159
plot(mod3)
## Warning: not plotting observations with leverage one:
##   4

Interactions … between State and obesity
mod4 = lm(adj_prevalence ~ yearly_avg_ozone + yearly_avg_pm25 + adj_smoking + state_desc * adj_obesity, data = df)
#summary(mod4)
summary.aov(mod4)
##                          Df Sum Sq Mean Sq  F value Pr(>F)    
## yearly_avg_ozone          1   30.1    30.1   310.39 <2e-16 ***
## yearly_avg_pm25           1  103.1   103.1  1063.31 <2e-16 ***
## adj_smoking               1 1330.5  1330.5 13721.95 <2e-16 ***
## state_desc               42 1025.0    24.4   251.71 <2e-16 ***
## adj_obesity               1   19.9    19.9   204.76 <2e-16 ***
## state_desc:adj_obesity   41   50.3     1.2    12.65 <2e-16 ***
## Residuals              2732  264.9     0.1                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(mod4)
## [1] 1511.104
plot(mod4)
## Warning: not plotting observations with leverage one:
##   4

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced